In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import plotly.express as px

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")
In [2]:
df = pd.read_csv('Sleep_Efficiency.csv')
In [3]:
df
Out[3]:
ID Age Gender Bedtime Wakeup time Sleep duration Sleep efficiency REM sleep percentage Deep sleep percentage Light sleep percentage Awakenings Caffeine consumption Alcohol consumption Smoking status Exercise frequency
0 1 65 Female 2021-03-06 01:00:00 2021-03-06 07:00:00 6.0 0.88 18 70 12 0.0 0.0 0.0 Yes 3.0
1 2 69 Male 2021-12-05 02:00:00 2021-12-05 09:00:00 7.0 0.66 19 28 53 3.0 0.0 3.0 Yes 3.0
2 3 40 Female 2021-05-25 21:30:00 2021-05-25 05:30:00 8.0 0.89 20 70 10 1.0 0.0 0.0 No 3.0
3 4 40 Female 2021-11-03 02:30:00 2021-11-03 08:30:00 6.0 0.51 23 25 52 3.0 50.0 5.0 Yes 1.0
4 5 57 Male 2021-03-13 01:00:00 2021-03-13 09:00:00 8.0 0.76 27 55 18 3.0 0.0 3.0 No 3.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
447 448 27 Female 2021-11-13 22:00:00 2021-11-13 05:30:00 7.5 0.91 22 57 21 0.0 0.0 0.0 No 5.0
448 449 52 Male 2021-03-31 21:00:00 2021-03-31 03:00:00 6.0 0.74 28 57 15 4.0 25.0 0.0 No 3.0
449 450 40 Female 2021-09-07 23:00:00 2021-09-07 07:30:00 8.5 0.55 20 32 48 1.0 NaN 3.0 Yes 0.0
450 451 45 Male 2021-07-29 21:00:00 2021-07-29 04:00:00 7.0 0.76 18 72 10 3.0 0.0 0.0 No 3.0
451 452 18 Male 2021-03-17 02:30:00 2021-03-17 10:00:00 7.5 0.63 22 23 55 1.0 50.0 0.0 No 1.0

452 rows × 15 columns

In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      452 non-null    int64  
 1   Age                     452 non-null    int64  
 2   Gender                  452 non-null    object 
 3   Bedtime                 452 non-null    object 
 4   Wakeup time             452 non-null    object 
 5   Sleep duration          452 non-null    float64
 6   Sleep efficiency        452 non-null    float64
 7   REM sleep percentage    452 non-null    int64  
 8   Deep sleep percentage   452 non-null    int64  
 9   Light sleep percentage  452 non-null    int64  
 10  Awakenings              432 non-null    float64
 11  Caffeine consumption    427 non-null    float64
 12  Alcohol consumption     438 non-null    float64
 13  Smoking status          452 non-null    object 
 14  Exercise frequency      446 non-null    float64
dtypes: float64(6), int64(5), object(4)
memory usage: 53.1+ KB
In [5]:
df.describe()
Out[5]:
ID Age Sleep duration Sleep efficiency REM sleep percentage Deep sleep percentage Light sleep percentage Awakenings Caffeine consumption Alcohol consumption Exercise frequency
count 452.000000 452.000000 452.000000 452.000000 452.000000 452.000000 452.000000 432.000000 427.000000 438.000000 446.000000
mean 226.500000 40.285398 7.465708 0.788916 22.615044 52.823009 24.561947 1.641204 23.653396 1.173516 1.791480
std 130.625419 13.172250 0.866625 0.135237 3.525963 15.654235 15.313665 1.356762 30.202785 1.621377 1.428134
min 1.000000 9.000000 5.000000 0.500000 15.000000 18.000000 7.000000 0.000000 0.000000 0.000000 0.000000
25% 113.750000 29.000000 7.000000 0.697500 20.000000 48.250000 15.000000 1.000000 0.000000 0.000000 0.000000
50% 226.500000 40.000000 7.500000 0.820000 22.000000 58.000000 18.000000 1.000000 25.000000 0.000000 2.000000
75% 339.250000 52.000000 8.000000 0.900000 25.000000 63.000000 32.500000 3.000000 50.000000 2.000000 3.000000
max 452.000000 69.000000 10.000000 0.990000 30.000000 75.000000 63.000000 4.000000 200.000000 5.000000 5.000000
In [6]:
df.shape
Out[6]:
(452, 15)
In [7]:
df.isna().sum()
Out[7]:
ID                         0
Age                        0
Gender                     0
Bedtime                    0
Wakeup time                0
Sleep duration             0
Sleep efficiency           0
REM sleep percentage       0
Deep sleep percentage      0
Light sleep percentage     0
Awakenings                20
Caffeine consumption      25
Alcohol consumption       14
Smoking status             0
Exercise frequency         6
dtype: int64
In [8]:
df.rename(columns = {'Wakeup time':'Wakeup_time', 'Sleep duration':'Sleep_duration ',"Sleep efficiency":"Sleep_efficiency",
                     "REM sleep percentage":"REM_sleep_percentage","Deep sleep percentage":"Deep_sleep_percentage",
                     "Light sleep percentage":"Light_sleep_percentage","Caffeine consumption":"Caffeine_consumption",
                     "Alcohol consumption":"Alcohol_consumption","Smoking status":"Smoking_status","Exercise frequency":"Exercise_frequency"}, inplace = True)

EDA¶

In [9]:
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(method='spearman'), annot=True, fmt='.0%')
plt.show()
In [10]:
# Show the distribution of values in each column of the dataframe
df.hist(figsize=(20, 20))
plt.show()
In [11]:
plt.scatter(df.Age, df['Sleep_efficiency'])
plt.xlabel('age')
plt.ylabel('Sleep efficiency')
Out[11]:
Text(0, 0.5, 'Sleep efficiency')
In [12]:
sns.histplot(df['Sleep_efficiency'],bins=10, kde=True)
Out[12]:
<AxesSubplot:xlabel='Sleep_efficiency', ylabel='Count'>
In [13]:
df.Gender.value_counts()
Out[13]:
Male      228
Female    224
Name: Gender, dtype: int64
In [14]:
sns.countplot(data=df,x="Gender", color="green")
plt.xlabel("Female or Male", color="blue",fontsize=10)
plt.ylabel("Count", color="blue",fontsize=10)
plt.show()
In [15]:
df.Age.value_counts()
Out[15]:
52    19
27    19
37    18
48    17
40    17
      ..
15     1
11     1
9      1
16     1
12     1
Name: Age, Length: 61, dtype: int64
In [16]:
sns.relplot(
    data=df, kind="line",
    x="Age", y="Sleep_efficiency", style="Gender", color="purple"
)
plt.show()
In [17]:
df.Smoking_status.value_counts()
Out[17]:
No     298
Yes    154
Name: Smoking_status, dtype: int64
In [18]:
sns.countplot(data=df,x="Smoking_status", color="pink")
plt.xlabel("Yes or No", color="blue",fontsize=10)
plt.ylabel("Count", color="blue",fontsize=10)
plt.title("number of smokers and non-smokers", color="blue",fontsize=10)
plt.show()
In [19]:
df.Exercise_frequency.value_counts()
Out[19]:
3.0    130
0.0    116
1.0     97
2.0     54
4.0     41
5.0      8
Name: Exercise_frequency, dtype: int64
In [20]:
sns.kdeplot(data=df, x="Exercise_frequency",color="brown",fill=True)
plt.xlabel("Exercise Frequency", color="brown", fontsize=10)
plt.ylabel("frequency", color="brown", fontsize=10)
plt.title("Exercise Frequency Kdeplot", color="brown",fontsize=10)
plt.show()
In [21]:
df.Light_sleep_percentage.value_counts()
Out[21]:
20    52
15    49
17    46
18    45
10    34
13    32
12    28
45    20
21    17
52    16
47    16
53    13
55    13
16    12
19    11
48    10
54     8
50     7
22     4
56     3
51     3
14     3
7      3
11     2
62     1
46     1
30     1
40     1
63     1
Name: Light_sleep_percentage, dtype: int64
In [22]:
sns.kdeplot(data=df, x="Light_sleep_percentage",color="gray",fill=True)
plt.xlabel("Light sleep percentage", color="gray", fontsize=10)
plt.ylabel("frequency", color="gray", fontsize=10)
plt.title("Light sleep percentage kdeplot", color="gray",fontsize=10)
plt.show()
In [23]:
df.REM_sleep_percentage.value_counts()
Out[23]:
20    92
22    67
28    58
23    56
18    49
25    31
24    26
27    25
26    15
15    14
19    11
30     7
21     1
Name: REM_sleep_percentage, dtype: int64
In [24]:
sns.kdeplot(data=df, x="REM_sleep_percentage",color="black",fill=True)
plt.xlabel("Sleep Duration", color="black", fontsize=10)
plt.ylabel("frequency", color="black", fontsize=10)
plt.title("Sleep duration kdeplot", color="black",fontsize=10)
plt.show()
In [25]:
sns.boxplot(data=df,x="Caffeine_consumption",y="Sleep_efficiency", color="pink")
plt.title("Does caffeine consumption affect sleep?", color="black",fontsize=10)
plt.show()
In [26]:
sns.boxplot(data=df,x="Alcohol_consumption",y="Sleep_efficiency", color="blue")
plt.title("What is the effect of drinking alcohol on sleep efficiency?", color="blue",fontsize=10)
plt.show()
In [27]:
df
Out[27]:
ID Age Gender Bedtime Wakeup_time Sleep_duration Sleep_efficiency REM_sleep_percentage Deep_sleep_percentage Light_sleep_percentage Awakenings Caffeine_consumption Alcohol_consumption Smoking_status Exercise_frequency
0 1 65 Female 2021-03-06 01:00:00 2021-03-06 07:00:00 6.0 0.88 18 70 12 0.0 0.0 0.0 Yes 3.0
1 2 69 Male 2021-12-05 02:00:00 2021-12-05 09:00:00 7.0 0.66 19 28 53 3.0 0.0 3.0 Yes 3.0
2 3 40 Female 2021-05-25 21:30:00 2021-05-25 05:30:00 8.0 0.89 20 70 10 1.0 0.0 0.0 No 3.0
3 4 40 Female 2021-11-03 02:30:00 2021-11-03 08:30:00 6.0 0.51 23 25 52 3.0 50.0 5.0 Yes 1.0
4 5 57 Male 2021-03-13 01:00:00 2021-03-13 09:00:00 8.0 0.76 27 55 18 3.0 0.0 3.0 No 3.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
447 448 27 Female 2021-11-13 22:00:00 2021-11-13 05:30:00 7.5 0.91 22 57 21 0.0 0.0 0.0 No 5.0
448 449 52 Male 2021-03-31 21:00:00 2021-03-31 03:00:00 6.0 0.74 28 57 15 4.0 25.0 0.0 No 3.0
449 450 40 Female 2021-09-07 23:00:00 2021-09-07 07:30:00 8.5 0.55 20 32 48 1.0 NaN 3.0 Yes 0.0
450 451 45 Male 2021-07-29 21:00:00 2021-07-29 04:00:00 7.0 0.76 18 72 10 3.0 0.0 0.0 No 3.0
451 452 18 Male 2021-03-17 02:30:00 2021-03-17 10:00:00 7.5 0.63 22 23 55 1.0 50.0 0.0 No 1.0

452 rows × 15 columns

FEATURE ENGINEERING¶

In [28]:
df.set_index('ID', inplace=True,drop=True)
df.head()
Out[28]:
Age Gender Bedtime Wakeup_time Sleep_duration Sleep_efficiency REM_sleep_percentage Deep_sleep_percentage Light_sleep_percentage Awakenings Caffeine_consumption Alcohol_consumption Smoking_status Exercise_frequency
ID
1 65 Female 2021-03-06 01:00:00 2021-03-06 07:00:00 6.0 0.88 18 70 12 0.0 0.0 0.0 Yes 3.0
2 69 Male 2021-12-05 02:00:00 2021-12-05 09:00:00 7.0 0.66 19 28 53 3.0 0.0 3.0 Yes 3.0
3 40 Female 2021-05-25 21:30:00 2021-05-25 05:30:00 8.0 0.89 20 70 10 1.0 0.0 0.0 No 3.0
4 40 Female 2021-11-03 02:30:00 2021-11-03 08:30:00 6.0 0.51 23 25 52 3.0 50.0 5.0 Yes 1.0
5 57 Male 2021-03-13 01:00:00 2021-03-13 09:00:00 8.0 0.76 27 55 18 3.0 0.0 3.0 No 3.0
In [29]:
df['Gender'] = df['Gender'].apply(lambda x: 0 if x == 'Male' else 1)
In [30]:
# df
In [31]:
df['Smoking_status'] = df['Smoking_status'].apply(lambda x: 0 if x == 'No' else 1)
In [32]:
df
Out[32]:
Age Gender Bedtime Wakeup_time Sleep_duration Sleep_efficiency REM_sleep_percentage Deep_sleep_percentage Light_sleep_percentage Awakenings Caffeine_consumption Alcohol_consumption Smoking_status Exercise_frequency
ID
1 65 1 2021-03-06 01:00:00 2021-03-06 07:00:00 6.0 0.88 18 70 12 0.0 0.0 0.0 1 3.0
2 69 0 2021-12-05 02:00:00 2021-12-05 09:00:00 7.0 0.66 19 28 53 3.0 0.0 3.0 1 3.0
3 40 1 2021-05-25 21:30:00 2021-05-25 05:30:00 8.0 0.89 20 70 10 1.0 0.0 0.0 0 3.0
4 40 1 2021-11-03 02:30:00 2021-11-03 08:30:00 6.0 0.51 23 25 52 3.0 50.0 5.0 1 1.0
5 57 0 2021-03-13 01:00:00 2021-03-13 09:00:00 8.0 0.76 27 55 18 3.0 0.0 3.0 0 3.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
448 27 1 2021-11-13 22:00:00 2021-11-13 05:30:00 7.5 0.91 22 57 21 0.0 0.0 0.0 0 5.0
449 52 0 2021-03-31 21:00:00 2021-03-31 03:00:00 6.0 0.74 28 57 15 4.0 25.0 0.0 0 3.0
450 40 1 2021-09-07 23:00:00 2021-09-07 07:30:00 8.5 0.55 20 32 48 1.0 NaN 3.0 1 0.0
451 45 0 2021-07-29 21:00:00 2021-07-29 04:00:00 7.0 0.76 18 72 10 3.0 0.0 0.0 0 3.0
452 18 0 2021-03-17 02:30:00 2021-03-17 10:00:00 7.5 0.63 22 23 55 1.0 50.0 0.0 0 1.0

452 rows × 14 columns

In [33]:
df = df.drop(['Bedtime','Wakeup_time'],axis=1)
df = df.dropna()
In [34]:
df.tail()
Out[34]:
Age Gender Sleep_duration Sleep_efficiency REM_sleep_percentage Deep_sleep_percentage Light_sleep_percentage Awakenings Caffeine_consumption Alcohol_consumption Smoking_status Exercise_frequency
ID
446 30 1 7.5 0.53 28 20 52 4.0 50.0 2.0 1 1.0
448 27 1 7.5 0.91 22 57 21 0.0 0.0 0.0 0 5.0
449 52 0 6.0 0.74 28 57 15 4.0 25.0 0.0 0 3.0
451 45 0 7.0 0.76 18 72 10 3.0 0.0 0.0 0 3.0
452 18 0 7.5 0.63 22 23 55 1.0 50.0 0.0 0 1.0
In [35]:
y = df['Sleep_efficiency']
X = df.drop('Sleep_efficiency',axis=1)
In [36]:
X
Out[36]:
Age Gender Sleep_duration REM_sleep_percentage Deep_sleep_percentage Light_sleep_percentage Awakenings Caffeine_consumption Alcohol_consumption Smoking_status Exercise_frequency
ID
1 65 1 6.0 18 70 12 0.0 0.0 0.0 1 3.0
2 69 0 7.0 19 28 53 3.0 0.0 3.0 1 3.0
3 40 1 8.0 20 70 10 1.0 0.0 0.0 0 3.0
4 40 1 6.0 23 25 52 3.0 50.0 5.0 1 1.0
5 57 0 8.0 27 55 18 3.0 0.0 3.0 0 3.0
... ... ... ... ... ... ... ... ... ... ... ...
446 30 1 7.5 28 20 52 4.0 50.0 2.0 1 1.0
448 27 1 7.5 22 57 21 0.0 0.0 0.0 0 5.0
449 52 0 6.0 28 57 15 4.0 25.0 0.0 0 3.0
451 45 0 7.0 18 72 10 3.0 0.0 0.0 0 3.0
452 18 0 7.5 22 23 55 1.0 50.0 0.0 0 1.0

388 rows × 11 columns

In [37]:
y
Out[37]:
ID
1      0.88
2      0.66
3      0.89
4      0.51
5      0.76
       ... 
446    0.53
448    0.91
449    0.74
451    0.76
452    0.63
Name: Sleep_efficiency, Length: 388, dtype: float64

Model Evaluation¶

In [38]:
def train_evaluate_model(model, X, y, n_splits=5, is_nn = False, epochs=100):
    '''
    arguments:
        model -- A scikit-learn estimator instance for regression
        X -- The input features for the model
        y -- The target variable for the model
        n_splits -- The number of folds to use in the KFold cross-validation (default=5)

    Returns:
        a dictionary of evaluation metrics for the model
    '''
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize lists to store the evaluation metrics
    mse_scores = []
    r2_scores = []

    # Use stratified k-fold cross-validation to evaluate the model
    for train_index, test_index in kf.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Fit the model to the training data
        if is_nn:
            model.fit(X_train, y_train , epochs=epochs,verbose=0)
        model.fit(X_train, y_train)

        # Make predictions on the test data
        y_pred = model.predict(X_test)

        # Compute the evaluation metrics
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Append the scores to the lists
        mse_scores.append(mse)
        r2_scores.append(r2)

    # Compute the mean and standard deviation of the evaluation metrics
    mse_mean = np.mean(mse_scores)
    mse_std = np.std(mse_scores)
    r2_mean = np.mean(r2_scores)
    r2_std = np.std(r2_scores)

    # Create a dataframe to store the evaluation metrics
    eval_df = pd.DataFrame({'Mean_Squared_Error': [mse_mean], 'MSE Std Dev': [mse_std], 'R_squared': [r2_mean], 'R2 Std Dev': [r2_std]})

    return eval_df

LINEAR REGRESSION¶

In [39]:
lr = LinearRegression()
results = train_evaluate_model(lr, X, y, 10)
results.index = ['LinearRegression']

results.sort_values(by='Mean_Squared_Error',ascending=True).style.background_gradient(cmap = sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
Out[39]:
  Mean_Squared_Error MSE Std Dev R_squared R2 Std Dev
LinearRegression 0.003825 0.000643 0.779134 0.062640

DECISION TREES¶

In [40]:
decision_tree = DecisionTreeRegressor()
decision_tree_results = train_evaluate_model(decision_tree, X, y, 10)

decision_tree_results.index = ['DecisionTree']

results = results.append(decision_tree_results)
In [41]:
results.sort_values(by='Mean_Squared_Error',ascending=True).style.background_gradient(cmap = sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
Out[41]:
  Mean_Squared_Error MSE Std Dev R_squared R2 Std Dev
LinearRegression 0.003825 0.000643 0.779134 0.062640
DecisionTree 0.004250 0.000779 0.757620 0.063238

K NEAREST NEIGHBOURS¶

In [42]:
KNN = KNeighborsRegressor(n_neighbors=7)
knn = train_evaluate_model(KNN, X, y, 10)
knn.index =['KNearsNeighbors']
results = results.append(knn)
In [43]:
results.sort_values(by='Mean_Squared_Error',ascending=False).style.background_gradient(cmap = sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
Out[43]:
  Mean_Squared_Error MSE Std Dev R_squared R2 Std Dev
KNearsNeighbors 0.004897 0.000606 0.720204 0.056080
DecisionTree 0.004250 0.000779 0.757620 0.063238
LinearRegression 0.003825 0.000643 0.779134 0.062640

RANDOM FOREST REGRESSOR¶

In [44]:
rfr= RandomForestRegressor()
rfr_result = train_evaluate_model(rfr, X, y, 10)
rfr_result.index = ['RandomForest']

results = results.append(rfr_result)
In [45]:
results.sort_values(by='Mean_Squared_Error',ascending=True).style.background_gradient(cmap = sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
Out[45]:
  Mean_Squared_Error MSE Std Dev R_squared R2 Std Dev
RandomForest 0.002458 0.000620 0.858697 0.045146
LinearRegression 0.003825 0.000643 0.779134 0.062640
DecisionTree 0.004250 0.000779 0.757620 0.063238
KNearsNeighbors 0.004897 0.000606 0.720204 0.056080

XGBOOST¶

In [46]:
xgboost = xgb.XGBRegressor()
xgboost_result = train_evaluate_model(xgboost, X, y, 10)
xgboost_result.index = ['XGBoost']

results = results.append(xgboost_result)
In [47]:
results.sort_values(by='Mean_Squared_Error',ascending=True).style.background_gradient(cmap = sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
Out[47]:
  Mean_Squared_Error MSE Std Dev R_squared R2 Std Dev
RandomForest 0.002458 0.000620 0.858697 0.045146
XGBoost 0.002716 0.000724 0.842738 0.054822
LinearRegression 0.003825 0.000643 0.779134 0.062640
DecisionTree 0.004250 0.000779 0.757620 0.063238
KNearsNeighbors 0.004897 0.000606 0.720204 0.056080

CATBOOST¶

In [48]:
catboost = CatBoostRegressor(verbose=False)
catboost_result = train_evaluate_model(catboost, X, y, 10)
catboost_result.index = ['Catboost']

results = results.append(catboost_result)
In [49]:
results.sort_values(by='Mean_Squared_Error',ascending=True).style.background_gradient(cmap = sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
Out[49]:
  Mean_Squared_Error MSE Std Dev R_squared R2 Std Dev
Catboost 0.002388 0.000468 0.863124 0.036016
RandomForest 0.002458 0.000620 0.858697 0.045146
XGBoost 0.002716 0.000724 0.842738 0.054822
LinearRegression 0.003825 0.000643 0.779134 0.062640
DecisionTree 0.004250 0.000779 0.757620 0.063238
KNearsNeighbors 0.004897 0.000606 0.720204 0.056080

LIGHT GBM¶

In [50]:
lgbmr = LGBMRegressor()
lgbmr_result = train_evaluate_model(lgbmr, X, y, 10)
lgbmr_result.index = ['LGBM']

results = results.append(lgbmr_result)
In [51]:
results.sort_values(by='Mean_Squared_Error',ascending=True).style.background_gradient(cmap = sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
Out[51]:
  Mean_Squared_Error MSE Std Dev R_squared R2 Std Dev
LGBM 0.002313 0.000537 0.867179 0.040736
Catboost 0.002388 0.000468 0.863124 0.036016
RandomForest 0.002458 0.000620 0.858697 0.045146
XGBoost 0.002716 0.000724 0.842738 0.054822
LinearRegression 0.003825 0.000643 0.779134 0.062640
DecisionTree 0.004250 0.000779 0.757620 0.063238
KNearsNeighbors 0.004897 0.000606 0.720204 0.056080

ENSEMBLING¶

In [52]:
regressors = [('lgbmr', LGBMRegressor()),('catboost', CatBoostRegressor(verbose=False)), ('xgb', xgb.XGBRegressor())]
voting_regressor = VotingRegressor(regressors)
voting_regressor_result = train_evaluate_model(voting_regressor, X, y, 10)
voting_regressor_result.index = ['Ensemble']

results = results.append(voting_regressor_result)
In [53]:
results.sort_values(by='Mean_Squared_Error',ascending=True).style.background_gradient(cmap = sns.color_palette("ch:s=-.2,r=.6", as_cmap=True))
Out[53]:
  Mean_Squared_Error MSE Std Dev R_squared R2 Std Dev
Ensemble 0.002290 0.000536 0.868065 0.040479
LGBM 0.002313 0.000537 0.867179 0.040736
Catboost 0.002388 0.000468 0.863124 0.036016
RandomForest 0.002458 0.000620 0.858697 0.045146
XGBoost 0.002716 0.000724 0.842738 0.054822
LinearRegression 0.003825 0.000643 0.779134 0.062640
DecisionTree 0.004250 0.000779 0.757620 0.063238
KNearsNeighbors 0.004897 0.000606 0.720204 0.056080

FEATURE IMPORTANCE¶

In [60]:
sns.set_style('whitegrid')
sns.set_context('poster')
sns.set_palette('colorblind')
sns.set(rc={'figure.figsize':(12,8)})
sns.barplot(x=xgboost.feature_importances_, y=X.columns)
plt.title('FEATURE IMPORTANCE')
plt.show()

MODEL COMPARISION¶

In [54]:
models = results.T.columns.values

Mean Squared Error¶

In [55]:
fig = px.bar(
    x=results.iloc[:9, 0].values,
    y=models,
    orientation='h',
    color=results['Mean_Squared_Error'].iloc[:9],
    color_continuous_scale='rdpu_r',  
    template="seaborn"
)

fig.update_layout(width=800, height=600,
                 xaxis=dict(title='Mean_Squared_Error'),
                 yaxis=dict(title="Models"))

fig.show()

R-2¶

In [56]:
fig = px.bar(
    x=results.iloc[:9, 2].values,
    y=models,
    orientation='h',
    color=results['R_squared'].iloc[:9],
    color_continuous_scale='rdpu',  
    template="seaborn"
)

fig.update_layout(width=800, height=600,
                 xaxis=dict(title='R_squared'),
                 yaxis=dict(title="Models"))

fig.show()

MSE- STD DEV¶

In [57]:
fig = px.bar(
    x=results.iloc[:9, 1].values,
    y=models,
    orientation='h',
    color=results['MSE Std Dev'].iloc[:9],
    color_continuous_scale='rdpu_r',  
    template="seaborn"
)

fig.update_layout(width=800, height=600,
                 xaxis=dict(title='MSE Std Dev'),
                 yaxis=dict(title="Models"))

fig.show()

R-2 STD DEV¶

In [58]:
fig = px.bar(
    x=results.iloc[:9, 3].values,
    y=models,
    orientation='h',
    color=results['R2 Std Dev'].iloc[:9],
    color_continuous_scale='rdpu_r',  
    template="seaborn"
)

fig.update_layout(width=800, height=600,
                 xaxis=dict(title='R2 Std Dev'),
                 yaxis=dict(title="Models"))

fig.show()
In [ ]: